In [1]:
import gc
import os
from pathlib import Path
import random
import sys

from tqdm.notebook import tqdm
import numpy as np
import pandas as pd
import scipy as sp


import matplotlib.pyplot as plt
import seaborn as sns

from IPython.core.display import display, HTML

# --- plotly ---
from plotly import tools, subplots
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.express as px
import plotly.figure_factory as ff
import plotly.io as pio
pio.templates.default = "plotly_dark"

# --- models ---
from sklearn import preprocessing
from sklearn.model_selection import KFold
import lightgbm as lgb
import xgboost as xgb
import catboost as cb

# --- setup ---
pd.set_option('max_columns', 50)
In [2]:
# Input data files are available in the "../input/" directory.
import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     filenames.sort()
#     for filename in filenames:
#         print(os.path.join(dirname, filename))
In [3]:
%%time
import requests

for filename in ['time_series_covid19_confirmed_global.csv',
                 'time_series_covid19_deaths_global.csv',
                 'time_series_covid19_recovered_global.csv',
                 'time_series_covid19_confirmed_US.csv',
                 'time_series_covid19_deaths_US.csv']:
    print(f'Downloading {filename}')
    url = f'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/{filename}'
    myfile = requests.get(url)
    open(filename, 'wb').write(myfile.content)
Downloading time_series_covid19_confirmed_global.csv
Downloading time_series_covid19_deaths_global.csv
Downloading time_series_covid19_recovered_global.csv
Downloading time_series_covid19_confirmed_US.csv
Downloading time_series_covid19_deaths_US.csv
Wall time: 625 ms
In [4]:
from datetime import datetime

def _convert_date_str(df):
    try:
        df.columns = list(df.columns[:4]) + [datetime.strptime(d, "%m/%d/%y").date().strftime("%Y-%m-%d") for d in df.columns[4:]]
    except:
        print('_convert_date_str failed with %y, try %Y')
        df.columns = list(df.columns[:4]) + [datetime.strptime(d, "%m/%d/%Y").date().strftime("%Y-%m-%d") for d in df.columns[4:]]


confirmed_global_df = pd.read_csv('time_series_covid19_confirmed_global.csv')
_convert_date_str(confirmed_global_df)

deaths_global_df = pd.read_csv('time_series_covid19_deaths_global.csv')
_convert_date_str(deaths_global_df)

recovered_global_df = pd.read_csv('time_series_covid19_recovered_global.csv')
_convert_date_str(recovered_global_df)
In [5]:
# Filter out problematic data points (The West Bank and Gaza had a negative value, cruise ships were associated with Canada, etc.)
removed_states = "Recovered|Grand Princess|Diamond Princess"
removed_countries = "US|The West Bank and Gaza"

confirmed_global_df.rename(columns={"Province/State": "Province_State", "Country/Region": "Country_Region"}, inplace=True)
deaths_global_df.rename(columns={"Province/State": "Province_State", "Country/Region": "Country_Region"}, inplace=True)
recovered_global_df.rename(columns={"Province/State": "Province_State", "Country/Region": "Country_Region"}, inplace=True)

confirmed_global_df = confirmed_global_df[~confirmed_global_df["Province_State"].replace(np.nan, "nan").str.match(removed_states)]
deaths_global_df    = deaths_global_df[~deaths_global_df["Province_State"].replace(np.nan, "nan").str.match(removed_states)]
recovered_global_df = recovered_global_df[~recovered_global_df["Province_State"].replace(np.nan, "nan").str.match(removed_states)]

confirmed_global_df = confirmed_global_df[~confirmed_global_df["Country_Region"].replace(np.nan, "nan").str.match(removed_countries)]
deaths_global_df    = deaths_global_df[~deaths_global_df["Country_Region"].replace(np.nan, "nan").str.match(removed_countries)]
recovered_global_df = recovered_global_df[~recovered_global_df["Country_Region"].replace(np.nan, "nan").str.match(removed_countries)]
In [6]:
confirmed_global_melt_df = confirmed_global_df.melt(
    id_vars=['Country_Region', 'Province_State', 'Lat', 'Long'], value_vars=confirmed_global_df.columns[4:], var_name='Date', value_name='ConfirmedCases')
deaths_global_melt_df = deaths_global_df.melt(
    id_vars=['Country_Region', 'Province_State', 'Lat', 'Long'], value_vars=confirmed_global_df.columns[4:], var_name='Date', value_name='Deaths')
recovered_global_melt_df = deaths_global_df.melt(
    id_vars=['Country_Region', 'Province_State', 'Lat', 'Long'], value_vars=confirmed_global_df.columns[4:], var_name='Date', value_name='Recovered')
In [7]:
train = confirmed_global_melt_df.merge(deaths_global_melt_df, on=['Country_Region', 'Province_State', 'Lat', 'Long', 'Date'])
train = train.merge(recovered_global_melt_df, on=['Country_Region', 'Province_State', 'Lat', 'Long', 'Date'])
In [8]:
# --- US ---
confirmed_us_df = pd.read_csv('time_series_covid19_confirmed_US.csv')
deaths_us_df = pd.read_csv('time_series_covid19_deaths_US.csv')

confirmed_us_df.drop(['UID', 'iso2', 'iso3', 'code3', 'FIPS', 'Admin2', 'Combined_Key'], inplace=True, axis=1)
deaths_us_df.drop(['UID', 'iso2', 'iso3', 'code3', 'FIPS', 'Admin2', 'Combined_Key', 'Population'], inplace=True, axis=1)

confirmed_us_df.rename({'Long_': 'Long'}, axis=1, inplace=True)
deaths_us_df.rename({'Long_': 'Long'}, axis=1, inplace=True)

_convert_date_str(confirmed_us_df)
_convert_date_str(deaths_us_df)

# clean
confirmed_us_df = confirmed_us_df[~confirmed_us_df.Province_State.str.match("Diamond Princess|Grand Princess|Recovered|Northern Mariana Islands|American Samoa")]
deaths_us_df = deaths_us_df[~deaths_us_df.Province_State.str.match("Diamond Princess|Grand Princess|Recovered|Northern Mariana Islands|American Samoa")]

# --- Aggregate by province state ---
#confirmed_us_df.groupby(['Country_Region', 'Province_State'])
confirmed_us_df = confirmed_us_df.groupby(['Country_Region', 'Province_State']).sum().reset_index()
deaths_us_df = deaths_us_df.groupby(['Country_Region', 'Province_State']).sum().reset_index()

# remove lat, long.
confirmed_us_df.drop(['Lat', 'Long'], inplace=True, axis=1)
deaths_us_df.drop(['Lat', 'Long'], inplace=True, axis=1)

confirmed_us_melt_df = confirmed_us_df.melt(
    id_vars=['Country_Region', 'Province_State'], value_vars=confirmed_us_df.columns[2:], var_name='Date', value_name='ConfirmedCases')
deaths_us_melt_df = deaths_us_df.melt(
    id_vars=['Country_Region', 'Province_State'], value_vars=deaths_us_df.columns[2:], var_name='Date', value_name='Deaths')

train_us = confirmed_us_melt_df.merge(deaths_us_melt_df, on=['Country_Region', 'Province_State', 'Date'])
In [9]:
train = pd.concat([train, train_us], axis=0, sort=False)

train_us.rename({'Country_Region': 'country', 'Province_State': 'province', 'Date': 'date', 'ConfirmedCases': 'confirmed', 'Deaths': 'fatalities'}, axis=1, inplace=True)
train_us['country_province'] = train_us['country'].fillna('') + '/' + train_us['province'].fillna('')
In [10]:
train
Out[10]:
Country_Region Province_State Lat Long Date ConfirmedCases Deaths Recovered
0 Afghanistan NaN 33.0000 65.0000 2020-01-22 0 0 0.0
1 Albania NaN 41.1533 20.1683 2020-01-22 0 0 0.0
2 Algeria NaN 28.0339 1.6596 2020-01-22 0 0 0.0
3 Andorra NaN 42.5063 1.5218 2020-01-22 0 0 0.0
4 Angola NaN -11.2027 17.8739 2020-01-22 0 0 0.0
... ... ... ... ... ... ... ... ...
5557 US Virginia NaN NaN 2020-05-03 18672 660 NaN
5558 US Washington NaN NaN 2020-05-03 15185 834 NaN
5559 US West Virginia NaN NaN 2020-05-03 1195 48 NaN
5560 US Wisconsin NaN NaN 2020-05-03 7964 339 NaN
5561 US Wyoming NaN NaN 2020-05-03 586 7 NaN

32548 rows × 8 columns

In [11]:
%%time
datadir = Path('/kaggle/input/covid19-global-forecasting-week-4')

# Read in the data CSV files
#train = pd.read_csv(datadir/'train.csv')
#test = pd.read_csv(datadir/'test.csv')
#submission = pd.read_csv(datadir/'submission.csv')
Wall time: 0 ns
In [12]:
train
Out[12]:
Country_Region Province_State Lat Long Date ConfirmedCases Deaths Recovered
0 Afghanistan NaN 33.0000 65.0000 2020-01-22 0 0 0.0
1 Albania NaN 41.1533 20.1683 2020-01-22 0 0 0.0
2 Algeria NaN 28.0339 1.6596 2020-01-22 0 0 0.0
3 Andorra NaN 42.5063 1.5218 2020-01-22 0 0 0.0
4 Angola NaN -11.2027 17.8739 2020-01-22 0 0 0.0
... ... ... ... ... ... ... ... ...
5557 US Virginia NaN NaN 2020-05-03 18672 660 NaN
5558 US Washington NaN NaN 2020-05-03 15185 834 NaN
5559 US West Virginia NaN NaN 2020-05-03 1195 48 NaN
5560 US Wisconsin NaN NaN 2020-05-03 7964 339 NaN
5561 US Wyoming NaN NaN 2020-05-03 586 7 NaN

32548 rows × 8 columns

In [13]:
train.rename({'Country_Region': 'country', 'Province_State': 'province', 'Id': 'id', 'Date': 'date', 'ConfirmedCases': 'confirmed', 'Deaths': 'fatalities', 'Recovered': 'recovered'}, axis=1, inplace=True)
train['country_province'] = train['country'].fillna('') + '/' + train['province'].fillna('')

# test.rename({'Country_Region': 'country', 'Province_State': 'province', 'Id': 'id', 'Date': 'date', 'ConfirmedCases': 'confirmed', 'Fatalities': 'fatalities'}, axis=1, inplace=True)
# test['country_province'] = test['country'].fillna('') + '/' + test['province'].fillna('')
In [14]:
ww_df = train.groupby('date')[['confirmed', 'fatalities']].sum().reset_index()
ww_df['new_case'] = ww_df['confirmed'] - ww_df['confirmed'].shift(1)
ww_df.tail()
Out[14]:
date confirmed fatalities new_case
98 2020-04-29 3172108 227659 75097.0
99 2020-04-30 3256674 233351 84566.0
100 2020-05-01 3343597 238613 86923.0
101 2020-05-02 3427163 243802 83566.0
102 2020-05-03 3506549 247464 79386.0
In [15]:
ww_melt_df = pd.melt(ww_df, id_vars=['date'], value_vars=['confirmed', 'fatalities', 'new_case'])
ww_melt_df
Out[15]:
date variable value
0 2020-01-22 confirmed 555.0
1 2020-01-23 confirmed 654.0
2 2020-01-24 confirmed 941.0
3 2020-01-25 confirmed 1434.0
4 2020-01-26 confirmed 2118.0
... ... ... ...
304 2020-04-29 new_case 75097.0
305 2020-04-30 new_case 84566.0
306 2020-05-01 new_case 86923.0
307 2020-05-02 new_case 83566.0
308 2020-05-03 new_case 79386.0

309 rows × 3 columns

In [16]:
fig = px.line(ww_melt_df, x="date", y="value", color='variable', 
              title="Worldwide Confirmed/Death Cases Over Time")
fig.show()
In [17]:
fig = px.line(ww_melt_df, x="date", y="value", color='variable',
              title="Worldwide Confirmed/Death Cases Over Time (Log scale)",
             log_y=True)
fig.show()
In [18]:
ww_df['mortality'] = ww_df['fatalities'] / ww_df['confirmed']

fig = px.line(ww_df, x="date", y="mortality", 
              title="Worldwide Mortality Rate Over Time")
fig.show()
In [19]:
country_df = train.groupby(['date', 'country'])[['confirmed', 'fatalities']].sum().reset_index()
country_df.tail()
Out[19]:
date country confirmed fatalities
19256 2020-05-03 West Bank and Gaza 353 2
19257 2020-05-03 Western Sahara 6 0
19258 2020-05-03 Yemen 10 2
19259 2020-05-03 Zambia 124 3
19260 2020-05-03 Zimbabwe 34 4
In [20]:
countries = country_df['country'].unique()
print(f'{len(countries)} countries are in dataset:\n{countries}')
187 countries are in dataset:
['Afghanistan' 'Albania' 'Algeria' 'Andorra' 'Angola'
 'Antigua and Barbuda' 'Argentina' 'Armenia' 'Australia' 'Austria'
 'Azerbaijan' 'Bahamas' 'Bahrain' 'Bangladesh' 'Barbados' 'Belarus'
 'Belgium' 'Belize' 'Benin' 'Bhutan' 'Bolivia' 'Bosnia and Herzegovina'
 'Botswana' 'Brazil' 'Brunei' 'Bulgaria' 'Burkina Faso' 'Burma' 'Burundi'
 'Cabo Verde' 'Cambodia' 'Cameroon' 'Canada' 'Central African Republic'
 'Chad' 'Chile' 'China' 'Colombia' 'Comoros' 'Congo (Brazzaville)'
 'Congo (Kinshasa)' 'Costa Rica' "Cote d'Ivoire" 'Croatia' 'Cuba' 'Cyprus'
 'Czechia' 'Denmark' 'Diamond Princess' 'Djibouti' 'Dominica'
 'Dominican Republic' 'Ecuador' 'Egypt' 'El Salvador' 'Equatorial Guinea'
 'Eritrea' 'Estonia' 'Eswatini' 'Ethiopia' 'Fiji' 'Finland' 'France'
 'Gabon' 'Gambia' 'Georgia' 'Germany' 'Ghana' 'Greece' 'Grenada'
 'Guatemala' 'Guinea' 'Guinea-Bissau' 'Guyana' 'Haiti' 'Holy See'
 'Honduras' 'Hungary' 'Iceland' 'India' 'Indonesia' 'Iran' 'Iraq'
 'Ireland' 'Israel' 'Italy' 'Jamaica' 'Japan' 'Jordan' 'Kazakhstan'
 'Kenya' 'Korea, South' 'Kosovo' 'Kuwait' 'Kyrgyzstan' 'Laos' 'Latvia'
 'Lebanon' 'Liberia' 'Libya' 'Liechtenstein' 'Lithuania' 'Luxembourg'
 'MS Zaandam' 'Madagascar' 'Malawi' 'Malaysia' 'Maldives' 'Mali' 'Malta'
 'Mauritania' 'Mauritius' 'Mexico' 'Moldova' 'Monaco' 'Mongolia'
 'Montenegro' 'Morocco' 'Mozambique' 'Namibia' 'Nepal' 'Netherlands'
 'New Zealand' 'Nicaragua' 'Niger' 'Nigeria' 'North Macedonia' 'Norway'
 'Oman' 'Pakistan' 'Panama' 'Papua New Guinea' 'Paraguay' 'Peru'
 'Philippines' 'Poland' 'Portugal' 'Qatar' 'Romania' 'Russia' 'Rwanda'
 'Saint Kitts and Nevis' 'Saint Lucia' 'Saint Vincent and the Grenadines'
 'San Marino' 'Sao Tome and Principe' 'Saudi Arabia' 'Senegal' 'Serbia'
 'Seychelles' 'Sierra Leone' 'Singapore' 'Slovakia' 'Slovenia' 'Somalia'
 'South Africa' 'South Sudan' 'Spain' 'Sri Lanka' 'Sudan' 'Suriname'
 'Sweden' 'Switzerland' 'Syria' 'Taiwan*' 'Tajikistan' 'Tanzania'
 'Thailand' 'Timor-Leste' 'Togo' 'Trinidad and Tobago' 'Tunisia' 'Turkey'
 'US' 'Uganda' 'Ukraine' 'United Arab Emirates' 'United Kingdom' 'Uruguay'
 'Uzbekistan' 'Venezuela' 'Vietnam' 'West Bank and Gaza' 'Western Sahara'
 'Yemen' 'Zambia' 'Zimbabwe']
In [21]:
target_date = country_df['date'].max()

print('Date: ', target_date)
for i in [1, 10, 100, 1000, 10000]:
    n_countries = len(country_df.query('(date == @target_date) & confirmed > @i'))
    print(f'{n_countries} countries have more than {i} confirmed cases')
Date:  2020-05-03
187 countries have more than 1 confirmed cases
179 countries have more than 10 confirmed cases
142 countries have more than 100 confirmed cases
89 countries have more than 1000 confirmed cases
37 countries have more than 10000 confirmed cases
In [22]:
ax = sns.distplot(np.log10(country_df.query('date == "2020-03-27"')['confirmed'] + 1))
ax.set_xlim([0, 6])
ax.set_xticks(np.arange(7))
_ = ax.set_xticklabels(['0', '10', '100', '1k', '10k', '100k'])
In [23]:
top_country_df = country_df.query('(date == @target_date) & (confirmed > 1000)').sort_values('confirmed', ascending=False)
top_country_melt_df = pd.melt(top_country_df, id_vars='country', value_vars=['confirmed', 'fatalities'])
In [24]:
fig = px.bar(top_country_melt_df.iloc[::-1],
             x='value', y='country', color='variable', barmode='group',
             title=f'Confirmed Cases/Deaths on {target_date}', text='value', height=1500, orientation='h')
fig.show()
In [25]:
top30_countries = top_country_df.sort_values('confirmed', ascending=False).iloc[:30]['country'].unique()
top30_countries_df = country_df[country_df['country'].isin(top30_countries)]
fig = px.line(top30_countries_df,
              x='date', y='confirmed', color='country',
              title=f'Confirmed Cases for top 30 country as of {target_date}')
fig.show()
In [26]:
top30_countries = top_country_df.sort_values('fatalities', ascending=False).iloc[:30]['country'].unique()
top30_countries_df = country_df[country_df['country'].isin(top30_countries)]
fig = px.line(top30_countries_df,
              x='date', y='fatalities', color='country',
              title=f'Fatalities for top 30 country as of {target_date}')
fig.show()
In [27]:
top_country_df = country_df.query('(date == @target_date) & (confirmed > 100)')
top_country_df['mortality_rate'] = top_country_df['fatalities'] / top_country_df['confirmed']
top_country_df = top_country_df.sort_values('mortality_rate', ascending=False)
In [28]:
fig = px.bar(top_country_df[:30].iloc[::-1],
             x='mortality_rate', y='country',
             title=f'Mortality rate HIGH: top 30 countries on {target_date}', text='mortality_rate', height=800, orientation='h')
fig.show()
In [29]:
fig = px.bar(top_country_df[-30:],
             x='mortality_rate', y='country',
             title=f'Mortality rate LOW: top 30 countries on {target_date}', text='mortality_rate', height=800, orientation='h')
fig.show()
In [30]:
all_country_df = country_df.query('date == @target_date')
all_country_df['confirmed_log1p'] = np.log10(all_country_df['confirmed'] + 1)
all_country_df['fatalities_log1p'] = np.log10(all_country_df['fatalities'] + 1)
all_country_df['mortality_rate'] = all_country_df['fatalities'] / all_country_df['confirmed']
In [31]:
fig = px.choropleth(all_country_df, locations="country", 
                    locationmode='country names', color="confirmed_log1p", 
                    hover_name="country", hover_data=["confirmed", 'fatalities', 'mortality_rate'],
                    range_color=[all_country_df['confirmed_log1p'].min(), all_country_df['confirmed_log1p'].max()], 
                    color_continuous_scale="peach", 
                    title='Countries with Confirmed Cases')

# I'd like to update colorbar to show raw values, but this does not work somehow...
# Please let me know if you know how to do this!!
trace1 = list(fig.select_traces())[0]
trace1.colorbar = go.choropleth.ColorBar(
    tickvals=[0, 1, 2, 3, 4, 5],
    ticktext=['1', '10', '100', '1000','10000', '10000'])
fig.show()
In [32]:
fig = px.choropleth(all_country_df, locations="country", 
                    locationmode='country names', color="fatalities_log1p", 
                    hover_name="country", range_color=[0, 4],
                    hover_data=['confirmed', 'fatalities', 'mortality_rate'],
                    color_continuous_scale="peach", 
                    title='Countries with fatalities')
fig.show()
In [33]:
fig = px.choropleth(all_country_df, locations="country", 
                    locationmode='country names', color="mortality_rate", 
                    hover_name="country", range_color=[0, 0.12], 
                    color_continuous_scale="peach", 
                    title='Countries with mortality rate')
fig.show()
In [34]:
n_countries = 20
n_start_death = 10
fatality_top_countires = top_country_df.sort_values('fatalities', ascending=False).iloc[:n_countries]['country'].values
country_df['date'] = pd.to_datetime(country_df['date'])


df_list = []
for country in fatality_top_countires:
    this_country_df = country_df.query('country == @country')
    start_date = this_country_df.query('fatalities > @n_start_death')['date'].min()
    this_country_df = this_country_df.query('date >= @start_date')
    this_country_df['date_since'] = this_country_df['date'] - start_date
    this_country_df['fatalities_log1p'] = np.log10(this_country_df['fatalities'] + 1)
    this_country_df['fatalities_log1p'] -= this_country_df['fatalities_log1p'].values[0]
    df_list.append(this_country_df)

tmpdf = pd.concat(df_list)
tmpdf['date_since_days'] = tmpdf['date_since'] / pd.Timedelta('1 days')
In [35]:
fig = px.line(tmpdf,
              x='date_since_days', y='fatalities_log1p', color='country',
              title=f'Fatalities by country since 10 deaths, as of {target_date}')
fig.add_trace(go.Scatter(x=[0, 21], y=[0, 3], name='Double by 7 days', line=dict(dash='dash', color=('rgb(200, 200, 200)'))))
fig.add_trace(go.Scatter(x=[0, 42], y=[0, 3], name='Double by 14 days', line=dict(dash='dash', color=('rgb(200, 200, 200)'))))
fig.add_trace(go.Scatter(x=[0, 63], y=[0, 3], name='Double by 21 days', line=dict(dash='dash', color=('rgb(200, 200, 200)'))))
fig.show()
In [36]:
country_df['prev_confirmed'] = country_df.groupby('country')['confirmed'].shift(1)
country_df['new_case'] = country_df['confirmed'] - country_df['prev_confirmed']
country_df['new_case'].fillna(0, inplace=True)
top30_country_df = country_df[country_df['country'].isin(top30_countries)]

fig = px.line(top30_country_df,
              x='date', y='new_case', color='country',
              title=f'DAILY NEW Confirmed cases world wide')
fig.show()
In [37]:
country_df['date'] = country_df['date'].apply(str)
country_df['confirmed_log1p'] = np.log1p(country_df['confirmed'])
country_df['fatalities_log1p'] = np.log1p(country_df['fatalities'])

fig = px.scatter_geo(country_df, locations="country", locationmode='country names', 
                     color="confirmed", size='confirmed', hover_name="country", 
                     hover_data=['confirmed', 'fatalities'],
                     range_color= [0, country_df['confirmed'].max()], 
                     projection="natural earth", animation_frame="date", 
                     title='COVID-19: Confirmed cases spread Over Time', color_continuous_scale="portland")
# fig.update(layout_coloraxis_showscale=False)
fig.show()
In [38]:
fig = px.scatter_geo(country_df, locations="country", locationmode='country names', 
                     color="fatalities", size='fatalities', hover_name="country", 
                     hover_data=['confirmed', 'fatalities'],
                     range_color= [0, country_df['fatalities'].max()], 
                     projection="natural earth", animation_frame="date", 
                     title='COVID-19: Fatalities growth Over Time', color_continuous_scale="portland")
fig.show()
In [39]:
country_df.loc[country_df['new_case'] < 0, 'new_case'] = 0.
fig = px.scatter_geo(country_df, locations="country", locationmode='country names', 
                     color="new_case", size='new_case', hover_name="country", 
                     hover_data=['confirmed', 'fatalities'],
                     range_color= [0, country_df['new_case'].max()], 
                     projection="natural earth", animation_frame="date", 
                     title='COVID-19: Daily NEW cases over Time', color_continuous_scale="portland")
fig.show()
In [40]:
for country in countries:
    province = train.query('country == @country')['province'].unique()
    if len(province) > 1:       
        print(f'Country {country} has {len(province)} provinces: {province}')
Country Australia has 8 provinces: ['Australian Capital Territory' 'New South Wales' 'Northern Territory'
 'Queensland' 'South Australia' 'Tasmania' 'Victoria' 'Western Australia']
Country Canada has 12 provinces: ['Alberta' 'British Columbia' 'Manitoba' 'New Brunswick'
 'Newfoundland and Labrador' 'Nova Scotia' 'Ontario'
 'Prince Edward Island' 'Quebec' 'Saskatchewan' 'Northwest Territories'
 'Yukon']
Country China has 33 provinces: ['Anhui' 'Beijing' 'Chongqing' 'Fujian' 'Gansu' 'Guangdong' 'Guangxi'
 'Guizhou' 'Hainan' 'Hebei' 'Heilongjiang' 'Henan' 'Hong Kong' 'Hubei'
 'Hunan' 'Inner Mongolia' 'Jiangsu' 'Jiangxi' 'Jilin' 'Liaoning' 'Macau'
 'Ningxia' 'Qinghai' 'Shaanxi' 'Shandong' 'Shanghai' 'Shanxi' 'Sichuan'
 'Tianjin' 'Tibet' 'Xinjiang' 'Yunnan' 'Zhejiang']
Country Denmark has 3 provinces: ['Faroe Islands' 'Greenland' nan]
Country France has 11 provinces: ['French Guiana' 'French Polynesia' 'Guadeloupe' 'Mayotte' 'New Caledonia'
 'Reunion' 'Saint Barthelemy' 'St Martin' 'Martinique' nan
 'Saint Pierre and Miquelon']
Country Netherlands has 5 provinces: ['Aruba' 'Curacao' 'Sint Maarten' nan 'Bonaire, Sint Eustatius and Saba']
Country US has 54 provinces: ['Alabama' 'Alaska' 'Arizona' 'Arkansas' 'California' 'Colorado'
 'Connecticut' 'Delaware' 'District of Columbia' 'Florida' 'Georgia'
 'Guam' 'Hawaii' 'Idaho' 'Illinois' 'Indiana' 'Iowa' 'Kansas' 'Kentucky'
 'Louisiana' 'Maine' 'Maryland' 'Massachusetts' 'Michigan' 'Minnesota'
 'Mississippi' 'Missouri' 'Montana' 'Nebraska' 'Nevada' 'New Hampshire'
 'New Jersey' 'New Mexico' 'New York' 'North Carolina' 'North Dakota'
 'Ohio' 'Oklahoma' 'Oregon' 'Pennsylvania' 'Puerto Rico' 'Rhode Island'
 'South Carolina' 'South Dakota' 'Tennessee' 'Texas' 'Utah' 'Vermont'
 'Virgin Islands' 'Virginia' 'Washington' 'West Virginia' 'Wisconsin'
 'Wyoming']
Country United Kingdom has 11 provinces: ['Bermuda' 'Cayman Islands' 'Channel Islands' 'Gibraltar' 'Isle of Man'
 'Montserrat' nan 'Anguilla' 'British Virgin Islands'
 'Turks and Caicos Islands' 'Falkland Islands (Malvinas)']
In [43]:
usa_state_code_df = pd.read_csv('./usa_states2.csv')
In [44]:
train_us
Out[44]:
country province date confirmed fatalities country_province
0 US Alabama 2020-01-22 0 0 US/Alabama
1 US Alaska 2020-01-22 0 0 US/Alaska
2 US Arizona 2020-01-22 0 0 US/Arizona
3 US Arkansas 2020-01-22 0 0 US/Arkansas
4 US California 2020-01-22 0 0 US/California
... ... ... ... ... ... ...
5557 US Virginia 2020-05-03 18672 660 US/Virginia
5558 US Washington 2020-05-03 15185 834 US/Washington
5559 US West Virginia 2020-05-03 1195 48 US/West Virginia
5560 US Wisconsin 2020-05-03 7964 339 US/Wisconsin
5561 US Wyoming 2020-05-03 586 7 US/Wyoming

5562 rows × 6 columns

In [45]:
# Prepare data frame only for US. 

#train_us = train.query('country == "US"')
train_us['mortality_rate'] = train_us['fatalities'] / train_us['confirmed']

# Convert province column to its 2-char code name,
state_name_to_code = dict(zip(usa_state_code_df['state_name'], usa_state_code_df['state_code']))
train_us['province_code'] = train_us['province'].map(state_name_to_code)

# Only show latest days.
train_us_latest = train_us.query('date == @target_date')
In [46]:
fig = px.choropleth(train_us_latest, locations='province_code', locationmode="USA-states",
                    color='confirmed', scope="usa", hover_data=['province', 'fatalities', 'mortality_rate'],
                    title=f'Confirmed cases in US on {target_date}')
fig.show()
In [47]:
train_us_latest.sort_values('confirmed', ascending=False)
Out[47]:
country province date confirmed fatalities country_province mortality_rate province_code
5541 US New York 2020-05-03 316415 24708 US/New York 0.078087 NY
5539 US New Jersey 2020-05-03 126744 7871 US/New Jersey 0.062102 NJ
5530 US Massachusetts 2020-05-03 68087 4004 US/Massachusetts 0.058807 MA
5522 US Illinois 2020-05-03 61499 2618 US/Illinois 0.042570 IL
5512 US California 2020-05-03 54903 2216 US/California 0.040362 CA
5547 US Pennsylvania 2020-05-03 51225 2720 US/Pennsylvania 0.053099 PA
5531 US Michigan 2020-05-03 43801 4053 US/Michigan 0.092532 MI
5517 US Florida 2020-05-03 36078 1379 US/Florida 0.038223 FL
5553 US Texas 2020-05-03 31998 878 US/Texas 0.027439 TX
5527 US Louisiana 2020-05-03 29340 2012 US/Louisiana 0.068575 LA
5514 US Connecticut 2020-05-03 29287 2436 US/Connecticut 0.083177 CT
5518 US Georgia 2020-05-03 28665 1184 US/Georgia 0.041305 GA
5529 US Maryland 2020-05-03 25462 1281 US/Maryland 0.050310 MD
5523 US Indiana 2020-05-03 19933 1132 US/Indiana 0.056790 IN
5544 US Ohio 2020-05-03 19914 1039 US/Ohio 0.052174 OH
5557 US Virginia 2020-05-03 18672 660 US/Virginia 0.035347 VA
5513 US Colorado 2020-05-03 16635 842 US/Colorado 0.050616 CO
5558 US Washington 2020-05-03 15185 834 US/Washington 0.054923 WA
5552 US Tennessee 2020-05-03 13177 210 US/Tennessee 0.015937 TN
5542 US North Carolina 2020-05-03 11770 434 US/North Carolina 0.036873 NC
5549 US Rhode Island 2020-05-03 9477 320 US/Rhode Island 0.033766 RI
5524 US Iowa 2020-05-03 9175 184 US/Iowa 0.020054 IA
5510 US Arizona 2020-05-03 8640 362 US/Arizona 0.041898 AZ
5534 US Missouri 2020-05-03 8618 381 US/Missouri 0.044210 MO
5560 US Wisconsin 2020-05-03 7964 339 US/Wisconsin 0.042567 WI
5508 US Alabama 2020-05-03 7888 290 US/Alabama 0.036765 AL
5533 US Mississippi 2020-05-03 7550 303 US/Mississippi 0.040132 MS
5532 US Minnesota 2020-05-03 6663 418 US/Minnesota 0.062735 MN
5550 US South Carolina 2020-05-03 6626 275 US/South Carolina 0.041503 SC
5536 US Nebraska 2020-05-03 5661 78 US/Nebraska 0.013778 NE
5537 US Nevada 2020-05-03 5472 258 US/Nevada 0.047149 NV
5515 US Delaware 2020-05-03 5208 177 US/Delaware 0.033986 DE
5554 US Utah 2020-05-03 5175 50 US/Utah 0.009662 UT
5525 US Kansas 2020-05-03 5156 144 US/Kansas 0.027929 KS
5526 US Kentucky 2020-05-03 5130 253 US/Kentucky 0.049318 KY
5516 US District of Columbia 2020-05-03 5016 251 US/District of Columbia 0.050040 DC
5545 US Oklahoma 2020-05-03 3972 238 US/Oklahoma 0.059919 OK
5540 US New Mexico 2020-05-03 3850 151 US/New Mexico 0.039221 NM
5511 US Arkansas 2020-05-03 3437 76 US/Arkansas 0.022112 AR
5546 US Oregon 2020-05-03 2680 109 US/Oregon 0.040672 OR
5551 US South Dakota 2020-05-03 2631 21 US/South Dakota 0.007982 SD
5538 US New Hampshire 2020-05-03 2518 86 US/New Hampshire 0.034154 NH
5521 US Idaho 2020-05-03 2061 64 US/Idaho 0.031053 ID
5548 US Puerto Rico 2020-05-03 1808 97 US/Puerto Rico 0.053650 PR
5559 US West Virginia 2020-05-03 1195 48 US/West Virginia 0.040167 WV
5543 US North Dakota 2020-05-03 1191 25 US/North Dakota 0.020991 ND
5528 US Maine 2020-05-03 1185 57 US/Maine 0.048101 ME
5555 US Vermont 2020-05-03 897 53 US/Vermont 0.059086 VT
5520 US Hawaii 2020-05-03 620 17 US/Hawaii 0.027419 HI
5561 US Wyoming 2020-05-03 586 7 US/Wyoming 0.011945 WY
5535 US Montana 2020-05-03 455 16 US/Montana 0.035165 MT
5509 US Alaska 2020-05-03 368 9 US/Alaska 0.024457 AK
5519 US Guam 2020-05-03 145 5 US/Guam 0.034483 GU
5556 US Virgin Islands 2020-05-03 66 4 US/Virgin Islands 0.060606 VI
In [48]:
fig = px.choropleth(train_us_latest, locations='province_code', locationmode="USA-states",
                    color='mortality_rate', scope="usa", hover_data=['province', 'fatalities', 'mortality_rate'],
                    title=f'Mortality rate in US on {target_date}')
fig.show()
In [49]:
train_us_march = train_us.query('date > "2020-03-01"')
fig = px.line(train_us_march,
              x='date', y='confirmed', color='province',
              title=f'Confirmed cases by state in US, as of {target_date}')
fig.show()
In [50]:
# Ref: https://www.kaggle.com/abhinand05/covid-19-digging-a-bit-deeper
europe_country_list =list([
    'Austria','Belgium','Bulgaria','Croatia','Cyprus','Czechia','Denmark','Estonia','Finland','France','Germany','Greece','Hungary','Ireland',
    'Italy', 'Latvia','Luxembourg','Lithuania','Malta','Norway','Netherlands','Poland','Portugal','Romania','Slovakia','Slovenia',
    'Spain', 'Sweden', 'United Kingdom', 'Iceland', 'Russia', 'Switzerland', 'Serbia', 'Ukraine', 'Belarus',
    'Albania', 'Bosnia and Herzegovina', 'Kosovo', 'Moldova', 'Montenegro', 'North Macedonia'])

country_df['date'] = pd.to_datetime(country_df['date'])
train_europe = country_df[country_df['country'].isin(europe_country_list)]
#train_europe['date_str'] = pd.to_datetime(train_europe['date'])
train_europe_latest = train_europe.query('date == @target_date')
In [51]:
fig = px.choropleth(train_europe_latest, locations="country", 
                    locationmode='country names', color="confirmed", 
                    hover_name="country", range_color=[1, train_europe_latest['confirmed'].max()], 
                    color_continuous_scale='portland', 
                    title=f'European Countries with Confirmed Cases as of {target_date}', scope='europe', height=800)
fig.show()
In [52]:
train_europe_march = train_europe.query('date >= "2020-03-01"')
fig = px.line(train_europe_march,
              x='date', y='confirmed', color='country',
              title=f'Confirmed cases by country in Europe, as of {target_date}')
fig.show()
In [53]:
fig = px.line(train_europe_march,
              x='date', y='fatalities', color='country',
              title=f'Fatalities by country in Europe, as of {target_date}')
fig.show()
In [54]:
train_europe_march['prev_confirmed'] = train_europe_march.groupby('country')['confirmed'].shift(1)
train_europe_march['new_case'] = train_europe_march['confirmed'] - train_europe_march['prev_confirmed']
fig = px.line(train_europe_march,
              x='date', y='new_case', color='country',
              title=f'DAILY NEW Confirmed cases by country in Europe')
fig.show()
In [55]:
country_latest = country_df.query('date == @target_date')

fig = px.choropleth(country_latest, locations="country", 
                    locationmode='country names', color="confirmed", 
                    hover_name="country", range_color=[1, 50000], 
                    color_continuous_scale='portland', 
                    title=f'Asian Countries with Confirmed Cases as of {target_date}', scope='asia', height=800)
fig.show()
In [56]:
top_asian_country_df = country_df[country_df['country'].isin(['China', 'Indonesia', 'Iran', 'Japan', 'Korea, South', 'Malaysia', 'Philippines'])]

fig = px.line(top_asian_country_df,
              x='date', y='new_case', color='country',
              title=f'DAILY NEW Confirmed cases world wide')
fig.show()
In [57]:
max_confirmed = country_df.groupby('country')['new_case'].max().reset_index()
country_latest = pd.merge(country_latest, max_confirmed.rename({'new_case': 'max_new_case'}, axis=1))
country_latest['new_case_peak_to_now_ratio'] = country_latest['new_case'] / country_latest['max_new_case']
In [58]:
recovering_country = country_latest.query('new_case_peak_to_now_ratio < 0.5')
major_recovering_country = recovering_country.query('confirmed > 100')
In [59]:
fig = px.bar(major_recovering_country.sort_values('new_case_peak_to_now_ratio', ascending=False),
             x='new_case_peak_to_now_ratio', y='country',
             title=f'Mortality rate LOW: top 30 countries on {target_date}', text='new_case_peak_to_now_ratio', height=1000, orientation='h')
fig.show()
In [60]:
fig = px.choropleth(country_latest, locations="country", 
                    locationmode='country names', color="new_case_peak_to_now_ratio", 
                    hover_name="country", range_color=[0, 1], 
                    # color_continuous_scale="peach", 
                    hover_data=['confirmed', 'fatalities', 'new_case', 'max_new_case'],
                    title='Countries with new_case_peak_to_now_ratio')
fig.show()
In [61]:
china_df = train.query('country == "China"')
china_df['prev_confirmed'] = china_df.groupby('province')['confirmed'].shift(1)
china_df['new_case'] = china_df['confirmed'] - china_df['prev_confirmed']
china_df.loc[china_df['new_case'] < 0, 'new_case'] = 0.
In [62]:
fig = px.line(china_df,
              x='date', y='new_case', color='province',
              title=f'DAILY NEW Confirmed cases in China by province')
fig.show()
In [63]:
def sigmoid(t, M, beta, alpha, offset=0):
    alpha += offset
    return M / (1 + np.exp(-beta * (t - alpha)))

def error(x, y, params):
    M, beta, alpha = params
    y_pred = sigmoid(x, M, beta, alpha)

    # apply weight, latest number is more important than past.
    weight = np.arange(len(y_pred)) ** 2
    loss_mse = np.mean((y_pred - y) ** 2 * weight)
    return loss_mse

def gen_random_color(min_value=0, max_value=256) -> str:
    """Generate random color for plotly"""
    r, g, b = np.random.randint(min_value, max_value, 3)
    return f'rgb({r},{g},{b})'
In [64]:
def fit_sigmoid(exclude_days=0):
    target_country_df_list = []
    pred_df_list = []
    for target_country in top30_countries:
        print('target_country', target_country)
        # --- Train ---
        target_country_df = country_df.query('country == @target_country')

        #train_start_date = target_country_df['date'].min()
        train_start_date = target_country_df.query('confirmed > 1000')['date'].min()
        train_end_date = pd.to_datetime(target_date) - pd.Timedelta(f'{exclude_days} days')
        target_date_df = target_country_df.query('(date >= @train_start_date) & (date <= @train_end_date)')
        if len(target_date_df) <= 7:
            print('WARNING: the data is not enough, use 7 more days...')
            train_start_date -= pd.Timedelta('7 days')
            target_date_df = target_country_df.query('(date >= @train_start_date) & (date <= @train_end_date)')

        confirmed = target_date_df['confirmed'].values
        x = np.arange(len(confirmed))

        lossfun = lambda params: error(x, confirmed, params)
        res = sp.optimize.minimize(lossfun, x0=[np.max(confirmed) * 5, 0.04, 2 * len(confirmed) / 3.], method='nelder-mead')
        M, beta, alpha = res.x
        # sigmoid_models[key] = (M, beta, alpha)
        # np.clip(sigmoid(list(range(len(data), len(data) + steps)), M, beta, alpha), 0, None).astype(int)

        # --- Pred ---
        pred_start_date = target_country_df['date'].min()
        pred_end_date = pd.to_datetime('2020-07-01')
        days = int((pred_end_date - pred_start_date) / pd.Timedelta('1 days'))
        # print('pred start', pred_start_date, 'end', pred_end_date, 'days', days)

        x = np.arange(days)
        offset = (train_start_date - pred_start_date) / pd.Timedelta('1 days')
        print('train_start_date', train_start_date, 'offset', offset, 'params', M, beta, alpha)
        y_pred = sigmoid(x, M, beta, alpha, offset=offset)
        # target_country_df['confirmed_pred'] = y_pred

        all_dates = [pred_start_date + np.timedelta64(x, 'D') for x in range(days)]
        pred_df = pd.DataFrame({
            'date': all_dates,
            'country': target_country,
            'confirmed_pred': y_pred,
        })

        target_country_df_list.append(target_country_df)
        pred_df_list.append(pred_df)
    return target_country_df_list, pred_df_list
In [65]:
def plot_sigmoid_fitting(target_country_df_list, pred_df_list, title=''):
    n_countries = len(top30_countries)

    # --- Plot ---
    fig = go.Figure()

    for i in range(n_countries):
        target_country = top30_countries[i]
        target_country_df = target_country_df_list[i]
        pred_df = pred_df_list[i]
        color = gen_random_color(min_value=20)
        # Prediction
        fig.add_trace(go.Scatter(
            x=pred_df['date'], y=pred_df['confirmed_pred'],
            name=f'{target_country}_pred',
            line=dict(color=color, dash='dash')
        ))

        # Ground truth
        fig.add_trace(go.Scatter(
            x=target_country_df['date'], y=target_country_df['confirmed'],
            mode='markers', name=f'{target_country}_actual',
            line=dict(color=color),
        ))
    fig.update_layout(
        title=title, xaxis_title='Date', yaxis_title='Confirmed cases')
    fig.show()
In [66]:
target_country_df_list, pred_df_list = fit_sigmoid(exclude_days=0)
target_country US
train_start_date 2020-03-11 00:00:00 offset 49.0 params 1374784.5880831727 0.09694156768951208 36.89728685748285
target_country Italy
train_start_date 2020-02-29 00:00:00 offset 38.0 params 222378.9611780029 0.08703298651409877 33.06836675858487
target_country United Kingdom
train_start_date 2020-03-14 00:00:00 offset 52.0 params 221896.13822112326 0.10170524355725563 34.47622458256889
target_country Spain
train_start_date 2020-03-09 00:00:00 offset 47.0 params 219800.7310559287 0.12901173545368036 24.506797557279334
target_country France
train_start_date 2020-03-08 00:00:00 offset 46.0 params 174286.65706621396 0.14859162540605303 30.233606692184402
target_country Belgium
train_start_date 2020-03-16 00:00:00 offset 54.0 params 54306.823897585185 0.10957922750617993 25.640988954015796
target_country Brazil
train_start_date 2020-03-21 00:00:00 offset 59.0 params 431034.8314883252 0.08527793230094835 56.50684603618325
target_country Germany
train_start_date 2020-03-08 00:00:00 offset 46.0 params 167389.4553444392 0.12072147591279209 25.46433364723733
target_country Iran
train_start_date 2020-03-02 00:00:00 offset 40.0 params 100894.05332051817 0.0960538351457054 31.775465004298397
target_country Netherlands
train_start_date 2020-03-15 00:00:00 offset 53.0 params 43945.36072348419 0.10537771183365907 24.892372087372216
target_country China
train_start_date 2020-01-25 00:00:00 offset 3.0 params 83060.22244421643 0.17856007703803328 14.200502715311664
target_country Canada
train_start_date 2020-03-21 00:00:00 offset 59.0 params 81744.33830926286 0.08796406727695136 31.856643300866708
target_country Turkey
train_start_date 2020-03-22 00:00:00 offset 60.0 params 135423.34176381945 0.1351895659970206 23.751000567060544
target_country Sweden
train_start_date 2020-03-15 00:00:00 offset 53.0 params 30581.26321249156 0.08060409275425513 36.4828089359149
target_country Mexico
train_start_date 2020-03-30 00:00:00 offset 68.0 params 61928.84616677843 0.09800806412976887 39.05682755020109
target_country Switzerland
train_start_date 2020-03-13 00:00:00 offset 51.0 params 30052.563918275486 0.1226450231162689 16.290828566726947
target_country Ecuador
train_start_date 2020-03-24 00:00:00 offset 62.0 params 39298.509456222935 0.14315822178687423 32.40422213817178
target_country India
train_start_date 2020-03-29 00:00:00 offset 67.0 params 92504.33221995397 0.09318485730822801 37.069396301956516
target_country Ireland
train_start_date 2020-03-23 00:00:00 offset 61.0 params 22962.46927815873 0.13777952464666932 22.257453028073105
target_country Peru
train_start_date 2020-03-31 00:00:00 offset 69.0 params 226296.96988618802 0.09098820034153479 48.002253662313564
target_country Russia
train_start_date 2020-03-27 00:00:00 offset 65.0 params 231458.84190826412 0.129514401139824 34.888316945385355
target_country Portugal
train_start_date 2020-03-20 00:00:00 offset 58.0 params 27392.354334892218 0.10641709465692861 19.68526172798131
target_country Indonesia
train_start_date 2020-03-27 00:00:00 offset 65.0 params 15168.92688620981 0.09341460001609718 26.29173653197386
target_country Romania
train_start_date 2020-03-26 00:00:00 offset 64.0 params 16402.773230401493 0.08865960442524708 22.757662118268726
target_country Poland
train_start_date 2020-03-25 00:00:00 offset 63.0 params 16911.047669323932 0.09035170116843923 23.241418970734085
target_country Philippines
train_start_date 2020-03-28 00:00:00 offset 66.0 params 14692.302837536303 0.05909060637725254 27.64401336239456
target_country Austria
train_start_date 2020-03-16 00:00:00 offset 54.0 params 15508.26201310059 0.1380400481915552 11.159956982425877
target_country Japan
train_start_date 2020-03-21 00:00:00 offset 59.0 params 15651.431349094906 0.1474884821830746 24.134769351763005
target_country Denmark
train_start_date 2020-03-17 00:00:00 offset 55.0 params 10700.436909071384 0.08876689211448777 22.509930829533083
target_country Algeria
train_start_date 2020-04-03 00:00:00 offset 72.0 params 29362.03094853006 0.04466635881520388 68.4603725881056
In [67]:
plot_sigmoid_fitting(target_country_df_list, pred_df_list, title='Sigmoid fitting with all latest data')
In [68]:
plot_sigmoid_fitting(target_country_df_list, pred_df_list, title='Sigmoid fitting without last 7days data')
In [ ]: